
#Within data
rm(list=ls())
set.seed(424242)
source('lib-00-utils.R')
library(ggplot2)
library(dplyr)
library(data.table)
library(lmtest)
library(lme4)
library(texreg)
library(lmerTest)
library(xtable)
library(lfe)
library(tidyjson)
library(jsonlite)
library(reshape)
library(regtools)
library(ggeffects)
library(broom)

figPath <- "/gscratch/comdata/users/kaylea/anon_underprod/figures/"
dataPath <- "/gscratch/comdata/users/kaylea/anon_underprod/knitr_rdata/"

load('cleanMassJoinW.RData') #contains wDF 
source('helpers.R')

### these sample weights are copied in manually, wrapped in c() parens and ""quotes, and treated as json

wWeights <- '{"12": 0.0361794500723589, "1": 0.000015656481759963903, "13": 0.06596306068601583, "6": 0.0010317042722873915, "16": 0.5780346820809249, "3": 0.00009731100499886633, "20": 1, "5": 0.0004694438967598982, "19": 1, "15": 0.25510204081632654, "9": 0.007566013467503972, "17": 1, "4": 0.00021600278211583366, "8": 0.004207868714496108, "7": 0.0022088708252341405, "10": 0.013073604392731076, "21": 1, "11": 0.021834061135371178, "14": 0.125, "2": 0.00004010271106357603, "0": 0.00000707171868363634, "18": 1}'


wWeightDF <- wWeights %>% gather_object('bucket_size_2') %>% append_values_number("sampleProb")
wWeightDF$..JSON <-NULL #cleanup
wWeightDF$document.id <-NULL
wWeightDF <- data.frame(wWeightDF)
wWeightDF$weights <- 1/wWeightDF$sampleProb

#these are fitted with wDF 
wDF <- merge(x=wDF, y=wWeightDF, by="bucket_size_2", all.x=TRUE)
panelWDF <- wDF[wDF$bucket_size_2 != "0",] #remove all cases where they made a single edit
pulledFromPanelWDF <- wDF[wDF$bucket_size_2 == "0",] #how many did we pull to make our panel

panelWDF.weighted <- panelWDF
panelWDF.weighted$underprod_factor <- -log(as.numeric(panelWDF.weighted$viewRankInMyMonth)/as.numeric(panelWDF.weighted$qualityRankInMyMonth))
max.wDF.uf <- max(panelWDF.weighted$underprod_factor)
min.wDF.uf <- min(panelWDF.weighted$underprod_factor)
##########
##########
###### NOTE: ranks are such that 1 is best.
####### equation for this kind of ranking needs a MINUS LOG
####### in order to be consistent with saner paper, where a highly positive up.fac.mean is severe underprod
######## - log( rank 1 view / rank 500 quality == -log (1/500) == 6.2 == underproduced
##########
##########
noAnonsPanelWDF.weighted=panelWDF.weighted[panelWDF$anon==FALSE,]

panelWDF <- NULL 
wDF <- NULL
pulledFromPanelWDF <- NULL

print("run panel models")


n.edits.wDF <- format(length(panelWDF.weighted$revid), format=f, big.mark=",")

## felm is response ~ covariates | factors
summary(m1_withinPerson.w <- felm(underprod_factor ~ poly(logEditorNthEdit, 2, raw = TRUE) | editor_id_or_ip , data=panelWDF.weighted, weights=panelWDF.weighted$weights), robust=TRUE)
summary(m2_withinPerson.w <- felm(underprod_factor ~ poly(logEditorNthEdit,2, raw=TRUE) | editor_id_or_ip , data=noAnonsPanelWDF.weighted, weights=noAnonsPanelWDF.weighted$weights), robust=TRUE)

#without rse:
summary(m1_withinPerson.w)
summary(m2_withinPerson.w)

#without poly
summary(m1_withinPerson.w.np <- felm(underprod_factor ~ logEditorNthEdit | editor_id_or_ip , data=panelWDF.weighted, weights=panelWDF.weighted$weights), robust=TRUE)
summary(m2_withinPerson.w.np <- felm(underprod_factor ~ logEditorNthEdit | editor_id_or_ip , data=noAnonsPanelWDF.weighted, weights=noAnonsPanelWDF.weighted$weights), robust=TRUE)

onlyAnonsPanelWDF.weighted=panelWDF.weighted[panelWDF.weighted$anon==TRUE,]
summary(m3_withinPerson.w <- felm(underprod_factor ~ poly(logEditorNthEdit,2, raw=TRUE) | editor_id_or_ip , data=onlyAnonsPanelWDF.weighted, weights=onlyAnonsPanelWDF.weighted$weights), robust=TRUE)

#without rse
summary(m3_withinPerson.w)

#without poly
summary(m3_withinPerson.w.np <- felm(underprod_factor ~ logEditorNthEdit | editor_id_or_ip , data=onlyAnonsPanelWDF.weighted, weights=onlyAnonsPanelWDF.weighted$weights), robust=TRUE)


## evaluate using a poly model vs a linear model
## r2 is only marginally better but AIC is better with polynomial
## felm doesn't offer bic calculation

# Get the AIC from the model summary
AIC_value.m1 <- AIC(m1_withinPerson.w)
AIC_value.m1.np <- AIC(m1_withinPerson.w.np)
AIC_value.m2 <- AIC(m2_withinPerson.w)
AIC_value.m2.np <- AIC(m2_withinPerson.w.np)
AIC_value.m3 <- AIC(m3_withinPerson.w)
AIC_value.m3.np <- AIC(m3_withinPerson.w.np)


# Get the number of observations in the data
num_observations <- nrow(panelWDF.weighted)
num_observations.account <- nrow(noAnonsPanelWDF.weighted)
num_observations.anon <- nrow(onlyAnonsPanelWDF.weighted)

# Get the number of parameters (including the intercept)
num_parameters.m1 <- length(coefficients(m1_withinPerson.w))
num_parameters.m1.np <- length(coefficients(m1_withinPerson.w.np))
num_parameters.m2 <- length(coefficients(m2_withinPerson.w))
num_parameters.m2.np <- length(coefficients(m2_withinPerson.w.np))
num_parameters.m3 <- length(coefficients(m3_withinPerson.w))
num_parameters.m3.np <- length(coefficients(m3_withinPerson.w.np))

# Calculate the BIC
BIC.m1 <- AIC_value.m1 + log(num_observations) * num_parameters.m1
BIC.m1.np <- AIC_value.m1.np + log(num_observations) * num_parameters.m1.np
BIC.m2 <- AIC_value.m2 + log(num_observations.account) * num_parameters.m2
BIC.m2.np <- AIC_value.m2.np + log(num_observations.account) * num_parameters.m2.np
BIC.m3 <- AIC_value.m3 + log(num_observations.anon) * num_parameters.m3
BIC.m3.np <- AIC_value.m3.np + log(num_observations.anon) * num_parameters.m3.np

# Print the BIC value
print(BIC.m1)
print(BIC.m1.np)
print(BIC.m2)
print(BIC.m2.np)
print(BIC.m3)
print(BIC.m3.np)

### note that both AIC and BIC are lower with the polynomial model
### although R2 is only marginally better with the poly


bp.pv.m1 <- format.pval(bptest(m1_withinPerson.w)$p.value, digits=2)
bp.pv.m2 <- format.pval(bptest(m2_withinPerson.w)$p.value, digits=2)
bp.pv.m3 <- format.pval(bptest(m3_withinPerson.w)$p.value, digits=2)

#screenreg(list(m1_withinPerson.w, m2_withinPerson.w, m3_withinPerson.w), digits=4, omit.coef = 'factor', ci.force = TRUE)

## to make a nice table with RSE we need to manually extract the RSE from summary, ugh
get_coefficients_with_rse <- function(felm_model) { 
  model_summary <- summary(felm_model, robust = TRUE)
  coefs_felm <- model_summary$coefficients[, 1]
  rse <- model_summary$coefficients[, 2]
  coefs_with_rse <- cbind(coefs_felm, rse)
  colnames(coefs_with_rse)[2] <- "RSE"
  return(coefs_with_rse)
}

# Extract coefficients and standard errors for all three models
coefs_with_se_model1 <- get_coefficients_with_rse(m1_withinPerson.w)
coefs_with_se_model1.np <- get_coefficients_with_rse(m1_withinPerson.w.np)
rownames(coefs_with_se_model1.np) <- "poly(logEditorNthEdit, 2, raw = TRUE)1"
coefs_with_se_model2 <- get_coefficients_with_rse(m2_withinPerson.w)
coefs_with_se_model2.np <- get_coefficients_with_rse(m2_withinPerson.w.np)
rownames(coefs_with_se_model2.np) <- "poly(logEditorNthEdit, 2, raw = TRUE)1"
coefs_with_se_model3 <- get_coefficients_with_rse(m3_withinPerson.w)
coefs_with_se_model3.np <- get_coefficients_with_rse(m3_withinPerson.w.np)
rownames(coefs_with_se_model3.np) <- "poly(logEditorNthEdit, 2, raw = TRUE)1"

# Combine all coefficient tables into one
combined_coefs <- rbind(coefs_with_se_model1.np, coefs_with_se_model2.np, coefs_with_se_model3.np, coefs_with_se_model1, coefs_with_se_model2, coefs_with_se_model3) ## note that order matters

# Define model names for texreg
model_names <- c('Within-Person: All', 'With Account Only', 'Without Account Only', 'All, Quadratic', 'With Account Only, Quadratic', 'Without Account Only, Quadratic') ##same order as previous!

# Define custom coefficient names
#custom_coef_names <- c('Revision Count (ln)', 'Revision Count (ln$^2$)')
#custom_coef_names <- c('Revision Count (ln)')

# Create the texreg table with the combined results and all the specified options

models_list <- list(m1_withinPerson.w.np, m2_withinPerson.w.np, m3_withinPerson.w.np, m1_withinPerson.w, m2_withinPerson.w, m3_withinPerson.w)
coefs_list <- list(combined_coefs)

con <- textConnection("withinAlignTexWeighted", "w")
sink(con, split=TRUE, type="output")
texreg(models_list, coefs = coefs_list,
                         ci.test = NULL,
                         digits = 4,
                         omit.coef = 'factor',
                         use.packages = FALSE,
#                         custom.coef.names = custom_coef_names,
                         custom.model.names = model_names,
                         table = FALSE,
                         include.adjrs = FALSE,
                         ci.force = TRUE)
sink()
close(con);rm(con)



#same table but doesn't use robust standard errors
con <- textConnection("withinAlignTexWeighted.nr", "w")
sink(con, split=TRUE, type="output")
texreg(list(m1_withinPerson.w, m2_withinPerson.w, m3_withinPerson.w), 
ci.test=NULL,
       digits=4, 
       omit.coef = 'factor', 
       use.packages=FALSE,
       custom.coef.names = c('Revision Count (ln)', 'Revision Count (ln$^2$)'), 
       custom.model.names = c('Within-Person: All', 'With Account Only', 'Without Account Only'), 
       table=FALSE,
       include.adjrs=FALSE, #don't need
       ci.force = TRUE)
sink()
close(con);rm(con)

## margeff for poly model
## make a DF just for the prediction task
fixeDF <- getfe(m1_withinPerson.w)
fixeDF.i <- getfe(m2_withinPerson.w)
fixeDF.a <- getfe(m3_withinPerson.w)
summary(fixeDF)
quantile(fixeDF$effect, c(0.05, 0.5, 0.95))
quantile(fixeDF$obs, c(0.05, 0.5, 0.95))

q50.i <- quantile(fixeDF.i$effect, 0.5)
q50.a <- quantile(fixeDF.a$effect, 0.5)

#q05 <- quantile(fixeDF$effect, 0.05)
#q95 <- quantile(fixeDF$effect, 0.95)

logSeq = seq(from=log(quantile(fixeDF$obs, 0.05)), to=log(quantile(fixeDF$obs, 0.95)), by=0.5) 

simWDF <- data.frame(`poly(logEditorNthEdit, 2, raw = TRUE)1`=logSeq, `poly(logEditorNthEdit, 2, raw = TRUE)2`=logSeq^2)

predWDF.i <- as.data.frame(as.matrix(simWDF) %*% coef(m2_withinPerson.w)) ##this is where the predict happens
predWDF.a <- as.data.frame(as.matrix(simWDF) %*% coef(m3_withinPerson.w)) ## and here
predWDF.i$q50 <- predWDF.i$V1 + q50.i
predWDF.a$q50 <- predWDF.a$V1 + q50.a
#predWDF$q05 <- predWDF$V1 + q05 #if we wanted to do 5-95 intervals
#predWDF$q95 <- predWDF$V1 + q95
predWDF.i$logEditorNthEdit <- simWDF$poly.logEditorNthEdit..2..raw...TRUE.1 #naming has been coerced over
predWDF.a$logEditorNthEdit <- simWDF$poly.logEditorNthEdit..2..raw...TRUE.1 #naming has been coerced over

predWDF.i$anon <- FALSE
predWDF.a$anon <- TRUE
predWDF <- rbind(predWDF.i, predWDF.a)

#### margeff for linear model

fixeDF.np <- getfe(m1_withinPerson.w.np)
fixeDF.i.np <- getfe(m2_withinPerson.w.np)
fixeDF.a.np <- getfe(m3_withinPerson.w.np)
summary(fixeDF.np)
quantile(fixeDF.np$effect, c(0.05, 0.5, 0.95))
quantile(fixeDF.np$obs, c(0.05, 0.5, 0.95))

q50.i.np <- quantile(fixeDF.i.np$effect, 0.5)
q50.a.np <- quantile(fixeDF.a.np$effect, 0.5)

simWDF.np <- data.frame(logEditorNthEdit=logSeq)

predWDF.i.np <- as.data.frame(as.matrix(simWDF.np) %*% coef(m2_withinPerson.w.np)) ##this is where the predict happens
predWDF.a.np <- as.data.frame(as.matrix(simWDF.np) %*% coef(m3_withinPerson.w.np)) ## and here
predWDF.i.np$q50 <- predWDF.i.np$V1 + q50.i.np
predWDF.a.np$q50 <- predWDF.a.np$V1 + q50.a.np
#predWDF$q05 <- predWDF$V1 + q05 #if we wanted intervals
#predWDF$q95 <- predWDF$V1 + q95
predWDF.i.np$logEditorNthEdit <- simWDF.np$logEditorNthEdit
predWDF.a.np$logEditorNthEdit <- simWDF.np$logEditorNthEdit

predWDF.i.np$anon <- FALSE
predWDF.a.np$anon <- TRUE
predWDF.np <- rbind(predWDF.i.np, predWDF.a.np)


## pause here and see what I have



fixeDF.clean <- data.frame('effect' = fixeDF$effect)

save(predWDF, predWDF.np, fixeDF.clean, file=paste0(dataPath, "weightedWithinPreds.RData"), version=2)

save(m1_withinPerson.w, m2_withinPerson.w, m3_withinPerson.w, file=paste0(dataPath, "weightedWithinModels.RData"), version=2)


if (!nosave) {
r <- list()

remember(max.wDF.uf)
remember(min.wDF.uf) 
remember(withinAlignTexWeighted)
remember(n.edits.wDF)
remember(bp.pv.m1) #heteroskedasticity test outcomes, pv < .05 means heteroskedastic 
remember(bp.pv.m2) 
remember(bp.pv.m3) 

save(r, file=paste0(dataPath, "within_weighted.RData"), version=2)
print("Remembrances complete.")
rm(r)

}

## downsample within data to allow for some EDA

allPeople <- unique(panelWDF.weighted$editor_id_or_ip)
peopleSample <- sample_n(data.frame(ident=allPeople), 100)

miniWDF <- subset(panelWDF.weighted, panelWDF.weighted$editor_id_or_ip %in% peopleSample)
save(miniWDF, file=(paste0(dataPath, "wDF_for_eda.RData")), version=2)


#Between data -- i.e. the random sample


rm(list=ls())
set.seed(424242)
source('lib-00-utils.R')
library(dplyr)
library(data.table)
library(ggplot2)
library(ggeffects)
library(lfe)
library(lme4)
library(lmerTest)
library(lmtest)
library(texreg)
library(jsonlite)
library(reshape)
library(regtools)
library(tidyjson)
library(xtable)
library(performance)

##remember to rename B data to below
load('cleanMassJoinB.RData') #contains bDF
source('helpers.R')
figPath <- "/gscratch/comdata/users/kaylea/anon_underprod/figures/"
dataPath <- "/gscratch/comdata/users/kaylea/anon_underprod/knitr_rdata/"



bWeights <- '{"12": 0.006981729790580222, "22": 0.11248745764847219, "1": 0.00677854749690949, "13": 0.005911202041161709, "16": 0.005860576074702185, "6": 0.012516236688043565, "3": 0.010274539813636288, "20": 0.015664970150399377, "5": 0.012025746642531795, "19": 0.010674163060222773, "15": 0.005286750156434937, "9": 0.011171209737005146, "17": 0.0075285711155979726, "4": 0.01129758018259827, "8": 0.01221286635240519, "7": 0.012651522541154455, "10": 0.009790729999766002, "21": 0.01838770024665261, "11": 0.008343261118792937, "14": 0.00533530953198398, "2": 0.008703808778017451, "0": 0.005075825730253972, "18": 0.009385229019766324}'
bWeightDF <- bWeights %>% gather_object('bucket_size_2') %>% append_values_number("sampleProb")
bWeightDF['..JSON'] <-NULL #cleanup
bWeightDF$document.id <-NULL
bWeightDF <- data.frame(bWeightDF)
bWeightDF$weights <- 1/bWeightDF$sampleProb
bDF <- merge(x=bDF, y=bWeightDF, by="bucket_size_2", all.x=TRUE)

bDF.weighted <- bDF
bDF <- NULL #so that R will error if I miss an update below

print("run hierarchical models -- between-editor sample")

n.edits.bDF <- format(length(bDF.weighted$revid), format=f, big.mark=",")

bDF.weighted$logEditorNthEdit = log(bDF.weighted$editor_nth_edit_nocollapse)
bDF.weighted$underprod_factor <- -log(as.numeric(bDF.weighted$viewRankInMyMonth)/as.numeric(bDF.weighted$qualityRankInMyMonth))
summary(bDF.weighted$underprod_factor) #how does the range look?
max.bDF.uf <- max(bDF.weighted$underprod_factor)
min.bDF.uf <- min(bDF.weighted$underprod_factor)


bDF.weighted$editor_id_or_ip <- as.character(bDF.weighted$editor_id_or_ip) #make sure it's not treating these as numeric
bDF.weighted$anon <- as.factor(bDF.weighted$anon)
bDFX.weighted = bDF.weighted[bDF.weighted$anon==FALSE,] 
bDFA.weighted = bDF.weighted[bDF.weighted$anon==TRUE,] 
(m0_hier.w <- lmer(underprod_factor ~ 1 + (1 | editor_id_or_ip), data=bDF.weighted, REML=FALSE, weights=bDF.weighted$weights)) #null model
(m1a_hier.w <- lmer(underprod_factor ~ poly(logEditorNthEdit, 2, raw=TRUE)*anon + (1 | editor_id_or_ip), data=bDF.weighted, REML=FALSE, weights=bDF.weighted$weights)) #random intercept
(m1a_hier.w.np <- lmer(underprod_factor ~ logEditorNthEdit*anon + (1 | editor_id_or_ip), data=bDF.weighted, REML=FALSE, weights=bDF.weighted$weights)) #random intercept
(h1_hier.w.np <- lmer(underprod_factor ~ logEditorNthEdit + (1 | editor_id_or_ip), data=bDF.weighted, REML=FALSE, weights=bDF.weighted$weights)) #random intercept
(h2_hier.w.np <- lmer(underprod_factor ~ anon + (1 | editor_id_or_ip), data=bDF.weighted, REML=FALSE, weights=bDF.weighted$weights)) #random intercept
(h12_hier.w.np <- lmer(underprod_factor ~ poly(logEditorNthEdit, 1, raw=TRUE)*anon + (1 | editor_id_or_ip), data=bDF.weighted, REML=FALSE, weights=bDF.weighted$weights)) #random intercept

## what's the median effect from our fit?

## what's the min?
## what's the max?
## let's pretend all our sims are someone with that effect

ranefDF <- as.data.frame(ranef(m1a_hier.w)) #has columns:           grpvar        term               grp     condval    condsd 
medianRanef <- median(ranefDF$condval)
medianCandidate <- ranefDF[ranefDF$condval==medianRanef,]
medianCandidate
#in case we're curious who the median is
bDF.weighted[bDF.weighted$editor_id_or_ip==medianCandidate,]


simDF.anon <- data.frame(logEditorNthEdit=seq(from=1, to=25, by=1), anon=TRUE, editor_id_or_ip=medianCandidate$grp)
simDF.ident <- data.frame(logEditorNthEdit=seq(from=1, to=25, by=1), anon=FALSE, editor_id_or_ip=medianCandidate$grp)

simDF <- rbind(simDF.ident, simDF.anon) ##for predicting marginal effects in the multilevel model
simDF$anon <- as.factor(simDF$anon)

fullSim.pred <- modelr::add_predictions(m1a_hier.w, data=simDF)
save(fullSim.pred, file=paste0(dataPath, "fullMarg.RData"), version=2)


#m2 is just identifieds
m2_hier.w <- lmer(underprod_factor ~ poly(logEditorNthEdit, 2, raw=TRUE) + (1 | editor_id_or_ip), data=bDFX.weighted, REML=FALSE, weights=bDFX.weighted$weights) #random intercept
#identDF.pred <- ggpredict(m2_hier.w, terms="logEditorNthEdit [all]", newdata=simDF.ident)
#save(identDF.pred, file=paste0(dataPath, "weightedIdentMarg.RData"), version=2)

#m3 is just anons
m3_hier.w <- lmer(underprod_factor ~ poly(logEditorNthEdit, 2, raw=TRUE) + (1 | editor_id_or_ip), data=bDFA.weighted, REML=FALSE, weights=bDFA.weighted$weights) #random intercept
#anonDF.pred <- ggpredict(m3_hier.w, terms="logEditorNthEdit [all]", newdata=simDF.anon)
#save(anonDF, file=paste0(dataPath, "weightedAnonMarg.RData"), version=2)

## extract raneffs and EDA them
### predict using median random effect (or mean)

print("How was the fit?")

summary(m0_hier.w)
summary(m1a_hier.w)
summary(m2_hier.w)
summary(m3_hier.w)
summary(h1_hier.w.np)
summary(h2_hier.w.np)
summary(h12_hier.w.np)
m1aHierCoefs.w <- summary(m1a_hier.w)$coefficients
anovaM1AH.w <- anova(m0_hier.w, m1a_hier.w) ##bingo!
anovaM1AH.w
summary(anovaM1AH.w)
anovaChiSq.w <- format.pval(anovaM1AH.w[2,8], digits=3, eps=.00001) #if less than .00001, round up to < .00001
BIC.h0.w = anovaM1AH.w[1,3]
BIC.h1.w = anovaM1AH.w[2,3]
BIC.delta.nice.w = round(BIC.h0.w-BIC.h1.w)

ranovaM1AH.w <- ranova(m1a_hier.w) ##check the random effects too

resM1.w <- summary(m1a_hier.w)
muAlpha.w <- resM1.w$coefficients[1] 


##Mako says a meaningful ICC is the model with no predictors other than the grouping var, so that's m0_hier 
sigma2s <- data.frame(VarCorr(m0_hier.w))[,"vcov"] 
sigma2alpha <- sigma2s[1]
sigma2y <- sigma2s[2]
icc.w <- sigma2alpha / (sigma2alpha + sigma2y)
ricc.w <- round(icc.w, 4)
#icc is on a scale of 0 to 1, measures internal similarity of an editor
#icc is ratio of between cluster variance to the total variance: how much of
#the variance in producedness is due to individual internal variation and how much is
#between individuals?
#low ICC indicates that the individual is NOT explaining a lot of what we're seeing
#high ICC indicates that the individual IS explaining a lot of what we're seeing

print("Making hierarchical table")
#####update/fix depending on poly or no

screenreg(m1a_hier.w, digits=4, omit.coef = 'factor', ci.force = TRUE)

con <- textConnection("betweenAlignTexWeighted", "w") #how we save R objects to strings 
sink(con, split=TRUE, type="output") 
texreg(list(h12_hier.w.np, m1a_hier.w), 
	omit.coef = 'factor', 
	#custom.coef.names = c('Intercept', 'Revision Count (ln)', 'Revision Count (ln$^2$)', 'Editor was IP-based', 'Revision Count (ln) and IP-based', 'Revision Count (ln$^2$) and IP-based'), 
       	custom.model.names = c('Linear Model', 'Polynomial Model'), 
       	use.packages=FALSE,
	ci.test=NULL,
       	table=FALSE,
       	digits=4,
       	ci.force = TRUE)

sink()
close(con);rm(con)

con <- textConnection("betweenAlignTexWeighted.np", "w") #how we save R objects to strings 
sink(con, split=TRUE, type="output") 
texreg(list(h1_hier.w.np, h2_hier.w.np, h12_hier.w.np), omit.coef = 'factor', 
#custom.coef.names = c('Intercept', 'Revision Count (ln)', 'Editor was IP-based', 'Revision Count (ln) and IP-based'), 
       custom.model.names = c('Underproduction and Experience', 'Underproduction and IP-based', 'Combined Model'), 
       use.packages=FALSE,
ci.test=NULL,
       table=FALSE,
       digits=4,
              ci.force = TRUE)

sink()
close(con);rm(con)

#maybe look at just reverts, and editor nth revert

bDF.weighted$bucket_size_2 <- ordered(bDF.weighted$bucket_size_2, levels=c(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21))


combo.summary.bDF.weighted <- summary(bDF.weighted$weighted_sum)
#clean out anything not needed in latex


#set up conditions for hypothetical using weighted model
## need to check this over
h.editsPerMonth.raw = 4500000 #4.5 mil
h.editsPerMonth.nice = format(h.editsPerMonth.raw, format=f, big.mark=",") #August 2019 figure, humans only
#https://stats.wikimedia.org/v2/#/en.wikipedia.org/contributing/user-edits/normal|bar|1-month|page_type~content|daily
h.viewsPerMonth.raw = 8960173622
h.viewsPerMonth.nice = format(h.viewsPerMonth.raw, format=f, big.mark=",") #August 2019 figure, enwiki only
#https://stats.wikimedia.org/v2/#/en.wikipedia.org/reading/total-page-views/normal|bar|1-year|~total|monthly
h.newbieEdits <- 1 
h.expertEdits <- 500 

## extract key quantities for hypotheticals

h.newbie.p.w <- summary(m1a_hier.w)$coefficients[1,1] + 
  summary(m1a_hier.w)$coefficients[2,1]*(log(h.newbieEdits)) + 
  summary(m1a_hier.w)$coefficients[3,1]*(log(h.newbieEdits)*log(h.newbieEdits))
h.newbie.p.nice.w = round(h.newbie.p.w, 4)
#predicted underprod value for a newer editor

#why is this var called 'p'????

h.expert.p.w <- summary(m1a_hier.w)$coefficients[1,1] + 
  summary(m1a_hier.w)$coefficients[2,1]*(log(h.expertEdits)) + 
  summary(m1a_hier.w)$coefficients[3,1]*(log(h.expertEdits)*log(h.expertEdits)) #squared term
h.expert.p.nice.w = round(h.expert.p.w, 4)
#predicted underprod value for an experienced editor


###### this step is supposed to get the number of views to an article of a given underprod value
###### but with the new approach this actually no longer makes sense, that's why it was giving bogus numbers in the old version of the paper
###### 
###### so what we need is to look at the data and find some examples at the median quality level that corresponds to the median quality level in the dataset
###### + / - .001, i.e. .01% of the range 0 to 5




expertLike <- subset(bDF.weighted, bDF.weighted$underprod_factor < h.expert.p.nice.w + .01)
expertLike <- subset(expertLike, expertLike$underprod_factor > h.expert.p.nice.w - .01)
h.expertLike.medviews <- median(expertLike$adjViews)
h.expertLike.quality <- median(expertLike$weighted_sum) 

newbieLike <- subset(bDF.weighted, bDF.weighted$underprod_factor < h.newbie.p.nice.w + .01)
newbieLike <- subset(newbieLike, newbieLike$underprod_factor > h.newbie.p.nice.w - .01)
h.newbieLike.quality <- median(newbieLike$weighted_sum) 
h.newbieLike.medviews <- median(newbieLike$adjViews)





#h.qualityModelIntercept.w = summary(m1a_hier.w)$coefficients[1,1]
#h.betaViews.w = summary(m1a_hier.w)$coefficients[2,1]

#h.newbieArticleViews.w <- exp((h.quality-h.newbie.p.w - h.qualityModelIntercept.w)/h.betaViews.w)
#h.newbieArticleViews.nice.w <- round(h.newbieArticleViews.w)

#h.expertArticleViews.w <- exp((h.quality-h.expert.p.w - h.qualityModelIntercept.w)/h.betaViews.w)
#h.expertArticleViews.nice.w <- round(h.expertArticleViews.w)

h.viewDelta.nice.w = h.expertLike.medviews - h.newbieLike.medviews

#options(scipen=999)
h.netImpact.raw.w = h.viewDelta.nice.w * h.editsPerMonth.raw
h.netImpact.nice.w = format(h.netImpact.raw.w, format="f", big.mark=",")

h.percentImpact.raw.w = 100* (h.netImpact.raw.w/h.viewsPerMonth.raw)
h.percentImpact.nice.w = round(h.percentImpact.raw.w, 3)


### here's where I am building a sample to use for the visualizations of the data / linear model / poly model discussion
###  columns are: -- observed nth edit, observed underprod, anon, linear-predicted underprod, poly-predicted underprod
### start with the entire dataframe

bDF.weighted$predict.poly <- predict(m1a_hier.w)
bDF.weighted$predict.linear <- predict(h12_hier.w.np)
bDF.temp <- data.frame("anon" = bDF.weighted$anon, "logEditorNthEdit" = bDF.weighted$logEditorNthEdit, "underprod_factor" = bDF.weighted$underprod_factor, "predict.linear" = bDF.weighted$predict.linear, "predict.poly" = bDF.weighted$predict.poly)
bDF.sample <- sample_n(bDF.temp, .1*dim(bDF.temp)[1])
save(bDF.sample, file="modelCompare.RData", version=2)

###model comparisons

linPolyAnova <- anova(m1a_hier.w, h12_hier.w.np, test = "LRT")
linPolyAnova


### may want to remove if not used downstream TODO
load('cleanMassJoinT.RData') #contains tDF 



if (!nosave) {
r <- list()

#descriptive statistics
remember(linPolyAnova)
remember(combo.summary.bDF.weighted)
remember(max.bDF.uf)
remember(min.bDF.uf)

#coefs
remember(m1aHierCoefs.w)
remember(BIC.delta.nice.w)
remember(ricc.w)

##begin hypotheticals
# fixed
remember(h.editsPerMonth.nice)
remember(h.editsPerMonth.raw)
remember(h.expertEdits)
remember(h.newbieEdits)
remember(h.newbie.p.nice.w)
remember(h.expert.p.nice.w)
# from the model output / dataset
remember(h.expertLike.quality)
remember(h.newbieLike.quality)
remember(h.expertLike.medviews)
remember(h.newbieLike.medviews)
remember(h.viewDelta.nice.w)

remember(h.netImpact.nice.w)
remember(h.netImpact.raw.w)
remember(h.percentImpact.nice.w)
remember(h.percentImpact.raw.w)
## end hypotheticals

remember(betweenAlignTexWeighted.np)
remember(betweenAlignTexWeighted)
remember(n.edits.bDF)

save(r, file=paste0(dataPath, "between_weighted.RData"), version=2)
print("Remembrances complete.")
rm(r)

}

